Read counts data

Load and transpose counts file (Sample IDs as header and miRNA IDs as first column)

# Read the raw count table as delivered.
oriCountsFile <- read.csv(file = "./mature_counts.csv")

# Transpose, then promote the first row of the transposed table to the header.
countsFile <- janitor::row_to_names(
  as.data.frame(t(oriCountsFile)),
  row_number = 1
)

# Move the row names into an explicit "miR" column and turn every '.' in the
# miRNA names into '-'.
countsFile <- tibble::rownames_to_column(countsFile, var = "miR")
countsFile$miR <- gsub(".", "-", countsFile$miR, fixed = TRUE)
countsFile

Update the IDs

# Convert the miRNA names with miRBaseConverter (the target version is the
# function's default; presumably miRBase v22 -- TODO confirm).
update <- miRNAVersionConvert(countsFile$miR)
head(update)

# Sanity check: number of distinct names before and after the conversion.
length(unique(countsFile$miR))
length(unique(update[["OriginalName"]]))
length(unique(update[["TargetName"]]))

Fetch the mirbase22 IDs and sequences

mrbse <- read.table(
  "../../miRNA/mirbase/mature_homo-sapiens_dataframe.txt",
  header = TRUE,
  sep = "\t"
)
# The first two rows don't match up with mirbase, are the only duplicates, and
# are the only sequences with T's for some reason, so they are removed.
mrbse <- mrbse[-(1:2), ]
head(mrbse)

Create microRNA Mapping table and Update Countfile miRNA Row Names

# Join the reference table (first column) to the conversion result (second
# column), keeping every converted entry even if it has no reference match.
mrbse2 <- merge(
  mrbse, update,
  by.x = 1, by.y = 2,
  all.x = FALSE, all.y = TRUE
)
# Drop the fourth column of the merged table.
mrbse2 <- mrbse2[, -4]
dim(mrbse2)
[1] 2423    3
head(mrbse2)
# Index the mapping table by the original miRNA name ...
rownames(mrbse2) <- mrbse2[["OriginalName"]]
# ... so the count rows (first keyed by their own name column) can be
# re-labelled with the mapped "Name" value.
rownames(countsFile) <- countsFile[[1]]
rownames(countsFile) <- mrbse2[rownames(countsFile), "Name"]

Create Row for each miRNA and Merge this with the data

# Sample columns are every column of countsFile except the leading name
# column; deriving the count here avoids a hard-coded number of samples.
sample_ids <- colnames(countsFile)[-1]

# Start from one zero-filled row per reference miRNA and one column per sample.
mergedData <- matrix(0, nrow = nrow(mrbse), ncol = length(sample_ids))
mergedData <- data.frame(mergedData)
mergedData <- cbind(mrbse, mergedData)
colnames(mergedData) <- c("Name", "Seq", sample_ids)
rownames(mergedData) <- mergedData[, 1]

# Copy the measured counts into the rows addressed by countsFile's row names;
# rows that are not addressed keep their zeros.
mergedData[rownames(countsFile), sample_ids] <- countsFile[, -1]
#mergedData <- merge(mrbse2, countsFile, by.x = 3, by.y = 1, all.x = T, all.y = F)
dim(mergedData)
[1] 2656  304
# Inspect the merged table: first rows, top-left corner, then the full table.
head(mergedData)
#mergedData <- mergedData[,-1] # Remove the original (old) IDs
mergedData[seq_len(5), seq_len(5)]
mergedData

Quintize the data

For each sample column, assign each miRNA a value from 1 to 5 according to the quintile (20-percentile bin) of the non-zero values it falls into. A value of 0 remains 0.

# Given a column, assign a number to each element from 0-5. All
# 0s get a 0, and the rest get a value according to the 20th percentile
# that it falls in among the non-zero values.
#
# vec:     numeric vector of non-negative values (NA allowed).
# returns: numeric vector of the same length and names as `vec`; 0 for zeros,
#          1-5 for non-zero values, NA where `vec` is NA.
quintize <- function(vec) {
  nonzero <- vec[which(vec != 0)]
  # Bin edges: 0 followed by the 20/40/60/80th percentiles of the non-zero
  # values. With no non-zero values only the 0 edge is needed.
  qntls <- 0
  if (length(nonzero) > 0) {
    qntls <- c(0, quantile(nonzero, 0.2 * (1:4), names = FALSE))
  }
  # With left.open = TRUE findInterval() counts the edges strictly below each
  # value, i.e. max(which(x > qntls)), and yields 0 for x == 0 and NA for NA.
  vec2 <- as.numeric(findInterval(vec, qntls, left.open = TRUE))
  names(vec2) <- names(vec)
  vec2
}
# Coerce every sample column (everything after the first two columns) to
# numeric before binning.
mergedData[, 3:ncol(mergedData)] <- lapply(
  mergedData[, 3:ncol(mergedData)],
  as.numeric
)
# Bin each sample column separately, then re-attach the two leading columns.
quintizedData <- apply(mergedData[, -(1:2)], 2, quintize)
quintizedData <- cbind(mergedData[, 1:2], quintizedData)
quintizedData[1:5, 1:5]

Combine the replicates

Get the meta information

meta <- read.table("./meta.txt", header = TRUE, sep = "\t")
# Remove the BioSample URL prefix from X.Sample_relation so that only the
# SAMN ID is left.
meta$Sample_ID <- gsub(
  "BioSample: https://www.ncbi.nlm.nih.gov/biosample/",
  "",
  meta$X.Sample_relation
)

Create Disease, Tissue, and Group (Tissue_Disease) columns

# Tissue: strip the "tissue: " label, replace spaces by dots, lower-case.
meta$Tissue <- str_replace(meta$X.Sample_source_name_ch1, "tissue: ", "")
meta$Tissue <- str_replace_all(meta$Tissue, " ", ".")
meta$Tissue <- tolower(meta$Tissue)

# Disease: strip the "disease status: " label, replace spaces by dots.
meta$Disease <- str_replace_all(meta$X.Sample_characteristics_ch1, "disease status: ", "")
meta$Disease <- str_replace_all(meta$Disease, " ", ".")

# Group is "<Tissue>_<Disease>", built from the named columns rather than from
# their position in the data frame. Adjacent-normal samples are labelled by
# tissue only.
meta$Group <- paste(meta$Tissue, meta$Disease, sep = "_")
meta$Group <- str_replace_all(meta$Group, "_Adjacent.Normal", "")

# Spaces were already replaced by dots above, so the dotted form has to be
# matched here; the spaced pattern "Adjacent Normal" could never match.
meta$Disease <- str_replace_all(meta$Disease, fixed("Adjacent.Normal"), "")

Grouping: Combine replicates (samples that have the same type)

# Distinct sample groups that will each become one output column.
uniqueTissues <- unique(meta$Group)
# Show, per sample, whether its ID is one of the quintized data columns.
meta$Sample_ID %in% colnames(quintizedData)
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[28] TRUE TRUE TRUE TRUE TRUE
# For every group, average the quintized values of its samples row by row.
combinedData <- lapply(uniqueTissues, function(grp) {
  sample_ids <- meta$Sample_ID[which(meta$Group == grp)]
  # as.data.frame() keeps rowMeans() working when a group has one sample only.
  group_values <- as.data.frame(quintizedData[, sample_ids])
  rowMeans(group_values)
})
names(combinedData) <- uniqueTissues

# One column per group, prefixed with the two leading columns.
combinedData <- do.call(cbind, combinedData)
combinedData <- cbind(quintizedData[, 1:2], combinedData)

Ensure the terms are standardized

Ensure that all the group names appear in the ontology or the corrections file.

Get the ontology and correction files

# Ontology file is read without a header, corrections file with one.
ont <- read.table("../ontology.txt", header = FALSE, sep = "\t")
corr <- read.table("../corrections.txt", header = TRUE, sep = "\t")

The check below should return no terms (`character(0)`), meaning every term is already in the ontology or the correction file. If terms are returned, add them to the correction file, the ontology file, or both.

# Split the composite column names (tissue + disease) into single terms.
trms <- unique(unlist(strsplit(names(combinedData)[3:ncol(combinedData)], "_")))
# Known terms: first column of the corrections plus columns 1 and 3 of the
# ontology.
known_terms <- union(corr[, 1], unique(unlist(ont[, c(1, 3)])))
# Terms that are neither in the ontology nor in the corrections.
trms[!(trms %in% known_terms)]
character(0)

Found 2 new terms: HCC and Adjacent.Normal. Both have been added to the ontology and correction files, so the column names only need to be replaced with their corrected terms.

# Replace every "_"-separated part of each column name that is listed in
# corr$currentTerm by the matching corr$correctedTerm; other parts are kept.
# (Assumes corr$correctedTerm is a character column.)
colnames(combinedData) <- vapply(colnames(combinedData), function(col_name) {
  parts <- strsplit(col_name, "_")[[1]]
  hit <- match(parts, corr$currentTerm)
  found <- !is.na(hit)
  parts[found] <- corr$correctedTerm[hit[found]]
  paste(parts, collapse = "_")
}, character(1))

Convert to long format

Add the Canonical column and Source column

data <- combinedData # all tissues have to exist in ontology or correction files
# Column count before the two annotation columns are appended.
n_cols <- ncol(data)
# Vectorised membership test: TRUE where the sequence is listed in mrbse$Seq.
data$Canonical <- data$Seq %in% mrbse$Seq
data$Source <- "Varghese*"
# Column order: the two leading columns, Canonical, Source, then the group
# columns. seq_len() keeps this valid even when there are no group columns.
data <- data[, c(1, 2, n_cols + 1, n_cols + 2, seq_len(n_cols)[-(1:2)])]
head(data)

Convert to long format

# One row per (miRNA, group) pair; reshape2's default output column names
# "variable" and "value" are spelled out.
data_long <- melt(
  data,
  id.vars = c("Name", "Seq", "Canonical", "Source"),
  variable.name = "variable",
  value.name = "value"
)

Binarization

# Binary is 0 where the scaled value is 0 and 1 otherwise. The comparison is
# vectorised, so it also copes with empty input and propagates NA.
data_long$Binary <- as.numeric(data_long$value != 0)
names(data_long) <- c("miR", "Seq", "Canonical", "Source", "Tissue", "Scale", "Binary")

Write to file

# Full long-format table.
write.table(
  data_long,
  file = paste0(result_path, "/varghese_longData.txt"),
  sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE
)
# Distinct rows of the first three columns (miR, Seq, Canonical).
write.table(
  unique(data_long[, 1:3]),
  file = paste0(result_path, "/varghese_miRNAs.txt"),
  sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE
)
# Distinct Tissue labels, one per line.
writeLines(
  as.character(unique(data_long$Tissue)),
  con = paste0(result_path, "/varghese_tissues.txt")
)
---
title: "Processing Varghese expression data"
author: "Gitta Ekaputeri, Anne-Christin Hauschild"
date: "29/08/2022" 
output: html_notebook
---


```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(stringr)
library(miRBaseConverter)
library(reshape2)

# Resolve the project root to an absolute path *before* the working directory
# is changed below. A relative result_path would otherwise be interpreted
# relative to the data directory and point to the wrong place when the result
# files are written.
root_path <- normalizePath("./", winslash = "/", mustWork = TRUE)
result_path <- file.path(root_path, "data", "miRNA_final")
data_dir <- file.path(root_path, "data", "miRNA", "vargheseData")

# Evaluate all following chunks inside the data directory.
knitr::opts_knit$set(root.dir = data_dir)
# Kept for chunk-by-chunk interactive runs.
setwd(data_dir)
```

### Read counts data

Load and transpose counts file (Sample IDs as header and miRNA IDs as first column)
```{r}
# Read the raw count table as delivered.
oriCountsFile <- read.csv(file = "./mature_counts.csv")

# Transpose, then promote the first row of the transposed table to the header.
countsFile <- janitor::row_to_names(
  as.data.frame(t(oriCountsFile)),
  row_number = 1
)

# Move the row names into an explicit "miR" column and turn every '.' in the
# miRNA names into '-'.
countsFile <- tibble::rownames_to_column(countsFile, var = "miR")
countsFile$miR <- gsub(".", "-", countsFile$miR, fixed = TRUE)
countsFile
```

### Update the IDs 

```{r}
# Convert the miRNA names with miRBaseConverter (the target version is the
# function's default; presumably miRBase v22 -- TODO confirm).
update <- miRNAVersionConvert(countsFile$miR)
head(update)

# Sanity check: number of distinct names before and after the conversion.
length(unique(countsFile$miR))
length(unique(update[["OriginalName"]]))
length(unique(update[["TargetName"]]))

```

Fetch the mirbase22 IDs and sequences

```{r}
mrbse <- read.table(
  "../../miRNA/mirbase/mature_homo-sapiens_dataframe.txt",
  header = TRUE,
  sep = "\t"
)
# Remove the first two rows that don't match up with mirbase and are the only
# duplicates, and are the only sequences with T's for some reason.
mrbse <- mrbse[-(1:2), ]
head(mrbse)
```

Create microRNA Mapping table and Update Countfile miRNA Row Names
```{r}
# Join the reference table (first column) to the conversion result (second
# column), keeping every converted entry even if it has no reference match.
mrbse2 <- merge(
  mrbse, update,
  by.x = 1, by.y = 2,
  all.x = FALSE, all.y = TRUE
)
# Drop the fourth column of the merged table.
mrbse2 <- mrbse2[, -4]
dim(mrbse2)
head(mrbse2)

# Index the mapping table by the original miRNA name ...
rownames(mrbse2) <- mrbse2[["OriginalName"]]
# ... so the count rows (first keyed by their own name column) can be
# re-labelled with the mapped "Name" value.
rownames(countsFile) <- countsFile[[1]]
rownames(countsFile) <- mrbse2[rownames(countsFile), "Name"]
```


Create Row for each miRNA and Merge this with the data

```{r}
# Sample columns are every column of countsFile except the leading name
# column; deriving the count here avoids a hard-coded number of samples.
sample_ids <- colnames(countsFile)[-1]

# Start from one zero-filled row per reference miRNA and one column per sample.
mergedData <- matrix(0, nrow = nrow(mrbse), ncol = length(sample_ids))
mergedData <- data.frame(mergedData)
mergedData <- cbind(mrbse, mergedData)
colnames(mergedData) <- c("Name", "Seq", sample_ids)
rownames(mergedData) <- mergedData[, 1]

# Copy the measured counts into the rows addressed by countsFile's row names;
# rows that are not addressed keep their zeros.
mergedData[rownames(countsFile), sample_ids] <- countsFile[, -1]
#mergedData <- merge(mrbse2, countsFile, by.x = 3, by.y = 1, all.x = T, all.y = F)
dim(mergedData)
head(mergedData)
#mergedData <- mergedData[,-1] # Remove the original (old) IDs
mergedData[1:5, 1:5]
mergedData
```




### Quintize the data

For each sample column, assign each miRNA a value from 1 to 5 according to the quintile (20-percentile bin) of the non-zero values it falls into. A value of 0 remains 0.

```{r}
# Given a column, assign a number to each element from 0-5. All
# 0s get a 0, and the rest get a value according to the 20th percentile
# that it falls in among the non-zero values.
#
# vec:     numeric vector of non-negative values (NA allowed).
# returns: numeric vector of the same length and names as `vec`; 0 for zeros,
#          1-5 for non-zero values, NA where `vec` is NA.
quintize <- function(vec) {
  nonzero <- vec[which(vec != 0)]
  # Bin edges: 0 followed by the 20/40/60/80th percentiles of the non-zero
  # values. With no non-zero values only the 0 edge is needed.
  qntls <- 0
  if (length(nonzero) > 0) {
    qntls <- c(0, quantile(nonzero, 0.2 * (1:4), names = FALSE))
  }
  # With left.open = TRUE findInterval() counts the edges strictly below each
  # value, i.e. max(which(x > qntls)), and yields 0 for x == 0 and NA for NA.
  vec2 <- as.numeric(findInterval(vec, qntls, left.open = TRUE))
  names(vec2) <- names(vec)
  vec2
}
```

```{r}
# Coerce every sample column (everything after the first two columns) to
# numeric before binning.
mergedData[, 3:ncol(mergedData)] <- lapply(
  mergedData[, 3:ncol(mergedData)],
  as.numeric
)
# Bin each sample column separately, then re-attach the two leading columns.
quintizedData <- apply(mergedData[, -(1:2)], 2, quintize)
quintizedData <- cbind(mergedData[, 1:2], quintizedData)
quintizedData[1:5, 1:5]
```




### Combine the replicates

Get the meta information

```{r}
meta <- read.table("./meta.txt", header = TRUE, sep = "\t")
# Remove the BioSample URL prefix from X.Sample_relation so that only the
# SAMN ID is left.
meta$Sample_ID <- gsub(
  "BioSample: https://www.ncbi.nlm.nih.gov/biosample/",
  "",
  meta$X.Sample_relation
)
```


Create Disease, Tissue, and Group (Tissue_Disease) columns

```{r}
# Tissue: strip the "tissue: " label, replace spaces by dots, lower-case.
meta$Tissue <- str_replace(meta$X.Sample_source_name_ch1, "tissue: ", "")
meta$Tissue <- str_replace_all(meta$Tissue, " ", ".")
meta$Tissue <- tolower(meta$Tissue)

# Disease: strip the "disease status: " label, replace spaces by dots.
meta$Disease <- str_replace_all(meta$X.Sample_characteristics_ch1, "disease status: ", "")
meta$Disease <- str_replace_all(meta$Disease, " ", ".")

# Group is "<Tissue>_<Disease>", built from the named columns rather than from
# their position in the data frame. Adjacent-normal samples are labelled by
# tissue only.
meta$Group <- paste(meta$Tissue, meta$Disease, sep = "_")
meta$Group <- str_replace_all(meta$Group, "_Adjacent.Normal", "")

# Spaces were already replaced by dots above, so the dotted form has to be
# matched here; the spaced pattern "Adjacent Normal" could never match.
meta$Disease <- str_replace_all(meta$Disease, fixed("Adjacent.Normal"), "")
```


Grouping: Combine replicates (samples that have the same type)

```{r}
# Distinct sample groups that will each become one output column.
uniqueTissues <- unique(meta$Group)
# Show, per sample, whether its ID is one of the quintized data columns.
meta$Sample_ID %in% colnames(quintizedData)

# For every group, average the quintized values of its samples row by row.
combinedData <- lapply(uniqueTissues, function(grp) {
  sample_ids <- meta$Sample_ID[which(meta$Group == grp)]
  # as.data.frame() keeps rowMeans() working when a group has one sample only.
  group_values <- as.data.frame(quintizedData[, sample_ids])
  rowMeans(group_values)
})
names(combinedData) <- uniqueTissues

# One column per group, prefixed with the two leading columns.
combinedData <- do.call(cbind, combinedData)
combinedData <- cbind(quintizedData[, 1:2], combinedData)
```


### Ensure the terms are standardized
Ensure that all the group names appear in the ontology or the corrections file.

Get the ontology and correction files

```{r}
# Ontology file is read without a header, corrections file with one.
ont <- read.table("../ontology.txt", header = FALSE, sep = "\t")
corr <- read.table("../corrections.txt", header = TRUE, sep = "\t")
```

The check below should return no terms (`character(0)`), meaning every term is already in the ontology or the correction file. If terms are returned, add them to the correction file, the ontology file, or both.

```{r}
# Split the composite column names (tissue + disease) into single terms.
trms <- unique(unlist(strsplit(names(combinedData)[3:ncol(combinedData)], "_")))
# Known terms: first column of the corrections plus columns 1 and 3 of the
# ontology.
known_terms <- union(corr[, 1], unique(unlist(ont[, c(1, 3)])))
# Terms that are neither in the ontology nor in the corrections.
trms[!(trms %in% known_terms)]
```
Found 2 new terms: HCC and Adjacent.Normal. Both have been added to the ontology and correction files, so the column names only need to be replaced with their corrected terms.

```{r}
# Replace every "_"-separated part of each column name that is listed in
# corr$currentTerm by the matching corr$correctedTerm; other parts are kept.
# (Assumes corr$correctedTerm is a character column.)
colnames(combinedData) <- vapply(colnames(combinedData), function(col_name) {
  parts <- strsplit(col_name, "_")[[1]]
  hit <- match(parts, corr$currentTerm)
  found <- !is.na(hit)
  parts[found] <- corr$correctedTerm[hit[found]]
  paste(parts, collapse = "_")
}, character(1))
```

### Convert to long format
Add the Canonical column and Source column

```{r}
data <- combinedData # all tissues have to exist in ontology or correction files
# Column count before the two annotation columns are appended.
n_cols <- ncol(data)
# Vectorised membership test: TRUE where the sequence is listed in mrbse$Seq.
data$Canonical <- data$Seq %in% mrbse$Seq
data$Source <- "Varghese*"
# Column order: the two leading columns, Canonical, Source, then the group
# columns. seq_len() keeps this valid even when there are no group columns.
data <- data[, c(1, 2, n_cols + 1, n_cols + 2, seq_len(n_cols)[-(1:2)])]
head(data)
```


Convert to long format

```{r}
# One row per (miRNA, group) pair; reshape2's default output column names
# "variable" and "value" are spelled out.
data_long <- melt(
  data,
  id.vars = c("Name", "Seq", "Canonical", "Source"),
  variable.name = "variable",
  value.name = "value"
)
```

Binarization
```{r}
# Binary is 0 where the scaled value is 0 and 1 otherwise. The comparison is
# vectorised, so it also copes with empty input and propagates NA.
data_long$Binary <- as.numeric(data_long$value != 0)
names(data_long) <- c("miR", "Seq", "Canonical", "Source", "Tissue", "Scale", "Binary")
```


### Write to file

```{r}
# Full long-format table.
write.table(
  data_long,
  file = paste0(result_path, "/varghese_longData.txt"),
  sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE
)
# Distinct rows of the first three columns (miR, Seq, Canonical).
write.table(
  unique(data_long[, 1:3]),
  file = paste0(result_path, "/varghese_miRNAs.txt"),
  sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE
)
# Distinct Tissue labels, one per line.
writeLines(
  as.character(unique(data_long$Tissue)),
  con = paste0(result_path, "/varghese_tissues.txt")
)
```
